Prediction using Supervised ML

Graduate Rotational Internship Program The Sparks Foundation

Data Science & Business Analytics Tasks

Prediction using Supervised ML

Introduction

  • Linear Regression with R Programming (R-studio)

  • In this project the we can see how a Simple Linear Regression can be used for prediction for the percentage of an student based on the no. of study hours.

  • It is implementing using R programming -rmarkdown report.

  • We will start with simple linear regression involving two variables.

Problem Statement

  • In this we need to predict the percentage of marks based on the number of hours they study.
  • Here we are using two variables Hours and Scores for the regression.

Data Collection

  • Collecting the student data from url
  • Reading the data into csv file
library(readr)
library(tidyverse)
library(tidymodels)
library(ggplot2)
student_scoresURL <- "https://raw.githubusercontent.com/AdiPersonalWorks/Random/master/student_scores%20-%20student_scores.csv"

student_scores <- read.csv(file = student_scoresURL, 
                  header = TRUE,
                     sep = ",",
                     stringsAsFactors = FALSE)
student_scores
##    Hours Scores
## 1    2.5     21
## 2    5.1     47
## 3    3.2     27
## 4    8.5     75
## 5    3.5     30
## 6    1.5     20
## 7    9.2     88
## 8    5.5     60
## 9    8.3     81
## 10   2.7     25
## 11   7.7     85
## 12   5.9     62
## 13   4.5     41
## 14   3.3     42
## 15   1.1     17
## 16   8.9     95
## 17   2.5     30
## 18   1.9     24
## 19   6.1     67
## 20   7.4     69
## 21   2.7     30
## 22   4.8     54
## 23   3.8     35
## 24   6.9     76
## 25   7.8     86
  • writing/saving the student_scores file into local directory
write.table(student_scores, file = "student_scores.csv ",
            row.names = FALSE,
            col.names = TRUE,
            sep = "\t",
            quote = FALSE)
  • Checking Basic Information about the student_scores data
  • Checking summary and structure of the data
head(student_scores)
##   Hours Scores
## 1   2.5     21
## 2   5.1     47
## 3   3.2     27
## 4   8.5     75
## 5   3.5     30
## 6   1.5     20
dim(student_scores)
## [1] 25  2
str(student_scores) 
## 'data.frame':    25 obs. of  2 variables:
##  $ Hours : num  2.5 5.1 3.2 8.5 3.5 1.5 9.2 5.5 8.3 2.7 ...
##  $ Scores: int  21 47 27 75 30 20 88 60 81 25 ...
summary(student_scores) 
##      Hours           Scores     
##  Min.   :1.100   Min.   :17.00  
##  1st Qu.:2.700   1st Qu.:30.00  
##  Median :4.800   Median :47.00  
##  Mean   :5.012   Mean   :51.48  
##  3rd Qu.:7.400   3rd Qu.:75.00  
##  Max.   :9.200   Max.   :95.00

Data Preparation

library(caTools)
set <- sample(2, nrow(student_scores), 
              replace = TRUE, 
              prob = c(0.7, 0.3))
train <- student_scores[set==1,]
train
##    Hours Scores
## 1    2.5     21
## 2    5.1     47
## 3    3.2     27
## 4    8.5     75
## 5    3.5     30
## 6    1.5     20
## 7    9.2     88
## 8    5.5     60
## 9    8.3     81
## 10   2.7     25
## 11   7.7     85
## 12   5.9     62
## 13   4.5     41
## 14   3.3     42
## 16   8.9     95
## 17   2.5     30
## 18   1.9     24
## 19   6.1     67
## 20   7.4     69
## 21   2.7     30
## 24   6.9     76
## 25   7.8     86
test <- student_scores[set==2,]
test
##    Hours Scores
## 15   1.1     17
## 22   4.8     54
## 23   3.8     35

Data Cleaning

  • Checking for Missing Values/NA/NAN
library(DataExplorer)
sum(is.na(train))
## [1] 0
sum(is.na(test))
## [1] 0
plot_missing(train)

# Exploratory Data Analysis (EDA) * describe - can computes the statistics of all numerical variables

library(Hmisc)
describe(train)
## train 
## 
##  2  Variables      22  Observations
## --------------------------------------------------------------------------------
## Hours 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       22        0       20    0.999    5.255    2.968    1.930    2.500 
##      .25      .50      .75      .90      .95 
##    2.825    5.300    7.625    8.480    8.880 
## 
## lowest : 1.5 1.9 2.5 2.7 3.2, highest: 7.8 8.3 8.5 8.9 9.2
##                                                                             
## Value        1.5   1.9   2.5   2.7   3.2   3.3   3.5   4.5   5.1   5.5   5.9
## Frequency      1     1     2     2     1     1     1     1     1     1     1
## Proportion 0.045 0.045 0.091 0.091 0.045 0.045 0.045 0.045 0.045 0.045 0.045
##                                                                 
## Value        6.1   6.9   7.4   7.7   7.8   8.3   8.5   8.9   9.2
## Frequency      1     1     1     1     1     1     1     1     1
## Proportion 0.045 0.045 0.045 0.045 0.045 0.045 0.045 0.045 0.045
## --------------------------------------------------------------------------------
## Scores 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##       22        0       20    0.998    53.68    29.83    21.15    24.10 
##      .25      .50      .75      .90      .95 
##    30.00    53.50    75.75    85.90    87.90 
## 
## lowest : 20 21 24 25 27, highest: 81 85 86 88 95
##                                                                             
## Value         20    21    24    25    27    30    41    42    47    60    62
## Frequency      1     1     1     1     1     3     1     1     1     1     1
## Proportion 0.045 0.045 0.045 0.045 0.045 0.136 0.045 0.045 0.045 0.045 0.045
##                                                                 
## Value         67    69    75    76    81    85    86    88    95
## Frequency      1     1     1     1     1     1     1     1     1
## Proportion 0.045 0.045 0.045 0.045 0.045 0.045 0.045 0.045 0.045
## --------------------------------------------------------------------------------
describe(test)
## test 
## 
##  2  Variables      3  Observations
## --------------------------------------------------------------------------------
## Hours 
##        n  missing distinct     Info     Mean      Gmd 
##        3        0        3        1    3.233    2.467 
##                             
## Value        1.1   3.8   4.8
## Frequency      1     1     1
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
## Scores 
##        n  missing distinct     Info     Mean      Gmd 
##        3        0        3        1    35.33    24.67 
##                             
## Value         17    35    54
## Frequency      1     1     1
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
  • Two continuous variables
library(ggplot2)

q <- ggplot(data = train, aes(x = Hours, y = Scores))+
  geom_line(colour = "skyblue") +
  geom_point(colour = "blue")
q

ggplot(data=train,aes(x=Hours,y=Scores)) + 
  geom_bar(stat ='identity',aes(fill=Scores))+
  coord_flip() + 
  theme_grey() + 
  scale_fill_gradient(name="Score Level")+
  labs(title = 'Scores according to hours',
       y='Score per hour',x='Hours of study')+ 
  geom_hline(yintercept = mean(student_scores$Scores),size = 1, color = 'red')

library(plotly)
plot_ly(train, x = ~Hours, y = ~Scores, type = 'bar', mode = 'markers',marker = list(color = "DARKGREEN", opacity = 0.5), size = 4) %>%  
  layout(title = 'Scores according to hours', 
                       yaxis = list(title = 'Scores'), 
                       xaxis = list(title = 'Hours of study') )

Modelling

  • Simple Linear Regression
model <- lm( Scores~  Hours,  data=train) #model building
model
## 
## Call:
## lm(formula = Scores ~ Hours, data = train)
## 
## Coefficients:
## (Intercept)        Hours  
##       1.887        9.857
  • Summary of model
summary(model) 
## 
## Call:
## lm(formula = Scores ~ Hours, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.673  -5.223   1.727   4.713   7.585 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.887      2.881   0.655     0.52    
## Hours          9.857      0.496  19.873 1.22e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.761 on 20 degrees of freedom
## Multiple R-squared:  0.9518, Adjusted R-squared:  0.9494 
## F-statistic: 394.9 on 1 and 20 DF,  p-value: 1.219e-14
#plot
plot(Scores~  Hours,  train, pch =4, frame = FALSE, col  = "red")
abline(model, col = "blue")

Model Prediction

# Prediction
cof <- coef(summary(model))
cof
##             Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) 1.886629  2.8812274  0.6548003 5.200567e-01
## Hours       9.857216  0.4960066 19.8731554 1.219422e-14
pred  <- predict(model,train)
pred
##        1        2        3        4        5        6        7        8 
## 26.52967 52.15843 33.42972 85.67296 36.38688 16.67245 92.57302 56.10132 
##        9       10       11       12       13       14       16       17 
## 83.70152 28.50111 77.78719 60.04420 46.24410 34.41544 89.61585 26.52967 
##       18       19       20       21       24       25 
## 20.61534 62.01565 74.83003 28.50111 69.90142 78.77291
train_pred <- data.frame(train,pred)
train_pred
##    Hours Scores     pred
## 1    2.5     21 26.52967
## 2    5.1     47 52.15843
## 3    3.2     27 33.42972
## 4    8.5     75 85.67296
## 5    3.5     30 36.38688
## 6    1.5     20 16.67245
## 7    9.2     88 92.57302
## 8    5.5     60 56.10132
## 9    8.3     81 83.70152
## 10   2.7     25 28.50111
## 11   7.7     85 77.78719
## 12   5.9     62 60.04420
## 13   4.5     41 46.24410
## 14   3.3     42 34.41544
## 16   8.9     95 89.61585
## 17   2.5     30 26.52967
## 18   1.9     24 20.61534
## 19   6.1     67 62.01565
## 20   7.4     69 74.83003
## 21   2.7     30 28.50111
## 24   6.9     76 69.90142
## 25   7.8     86 78.77291

Comparing actual values with Predicted values

Scores_comp <- data.frame(Actual  = train$Scores, Predcited = train_pred$pred)
Scores_comp
##    Actual Predcited
## 1      21  26.52967
## 2      47  52.15843
## 3      27  33.42972
## 4      75  85.67296
## 5      30  36.38688
## 6      20  16.67245
## 7      88  92.57302
## 8      60  56.10132
## 9      81  83.70152
## 10     25  28.50111
## 11     85  77.78719
## 12     62  60.04420
## 13     41  46.24410
## 14     42  34.41544
## 15     95  89.61585
## 16     30  26.52967
## 17     24  20.61534
## 18     67  62.01565
## 19     69  74.83003
## 20     30  28.50111
## 21     76  69.90142
## 22     86  78.77291

Actual vs Predicted Graph

Actual <- c(Scores_comp$Actual)
Predcited <- c(Scores_comp$Predcited)
plot(Actual, type = "b", frame = FALSE, pch = 10, 
     col = "#4AC6B7", xlab = "x", ylab = "y")

lines(Predcited, pch = 10, col = "#C61951", type = "b", lty = 2)

legend("topleft", legend=c("Actual", "Predicted"), col=c("#4AC6B7", "#C61951"), 
       pch = 10, lty = 1:1, cex=0.6)

Model Diagnostics

par(mfrow=c(2,2))

plot(model)

## Test data Evoluation

pred  <- predict(model,test)
tibble(pred)
## # A tibble: 3 x 1
##    pred
##   <dbl>
## 1  12.7
## 2  49.2
## 3  39.3
test_pred <- data.frame(test,pred)
test_pred
##    Hours Scores     pred
## 15   1.1     17 12.72957
## 22   4.8     54 49.20127
## 23   3.8     35 39.34405

Actual vs Predicted graph

Scores_comp2 <- data.frame(Actual  = test$Scores, Predcited = test_pred$pred)
Scores_comp2
##   Actual Predcited
## 1     17  12.72957
## 2     54  49.20127
## 3     35  39.34405

The predicted score if a student studies for 9.25 hrs/ day

  • Predicting score if a student studies for 9.25 hrs/day
pred_hour  <-  predict(model, data.frame(Hours=9.25)) 
pred_hour
##        1 
## 93.06588
#(or)
coef(model)[1] + 9.25*coef(model)[2] #Mathematical form
## (Intercept) 
##    93.06588
library(highcharter)

hc <- train_pred_new %>% group_by(Scores, Hours) %>%
  hchart('column', hcaes(x = 'Hours', y = 'Scores') ) 

hc